import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
from warnings import filterwarnings
filterwarnings("ignore")
D:\anaconda files\lib\site-packages\scipy\__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.4
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
# Load the diabetes dataset from a local CSV file into a DataFrame.
# NOTE(review): the path is an absolute, machine-specific Windows location;
# anyone else running this must point it at their own copy of diabetes.csv.
data=pd.read_csv("C:\\Users\\laxma\\Downloads\\diabetes.csv")
# Bare expression: displays the DataFrame as the cell output.
data
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
| 764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
| 765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
| 766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
| 767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
768 rows × 9 columns
# Preview the first rows (pandas shows five by default) to check column order and typical values.
data.head()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
# Preview the last rows (five by default) to confirm the file was read through to the end.
data.tail()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
| 764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
| 765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
| 766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
| 767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
# NOTE(review): the recorded output shows a minimum of 0 for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI - presumably placeholders for missing measurements
# rather than real readings; confirm before modelling.
data.describe()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
| mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
| std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
| 25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
| 50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
| 75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
| max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
# Print the index range, per-column non-null counts, dtypes and memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pregnancies 768 non-null int64 1 Glucose 768 non-null int64 2 BloodPressure 768 non-null int64 3 SkinThickness 768 non-null int64 4 Insulin 768 non-null int64 5 BMI 768 non-null float64 6 DiabetesPedigreeFunction 768 non-null float64 7 Age 768 non-null int64 8 Outcome 768 non-null int64 dtypes: float64(2), int64(7) memory usage: 54.1 KB
# Count missing (NaN/None) entries per column.
# Only real nulls are counted; a value stored as 0 is not treated as missing here.
data.isnull().sum()
Pregnancies 0 Glucose 0 BloodPressure 0 SkinThickness 0 Insulin 0 BMI 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 dtype: int64
# Number of rows that exactly repeat an earlier row across all columns.
data.duplicated().sum()
0
# List the column labels; these exact names are used as x/y keys in the plots below.
data.columns
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
dtype='object')
# (rows, columns) of the loaded table.
data.shape
(768, 9)
# VISUALIZATION

# Matplotlib bar chart: Age drawn at each Pregnancies value, tick labels turned vertical.
plt.bar(data['Pregnancies'], data['Age'])
plt.xticks(rotation=90)
plt.show()

# Plotly bar chart of Age against Glucose, coloured by the Glucose value itself.
fig = px.bar(data_frame=data, x='Glucose', y='Age', color='Glucose')
fig.show()

# Plotly violin plot: Age distribution for each Pregnancies value, one colour per value.
fig = px.violin(data_frame=data, x='Pregnancies', y='Age', color='Pregnancies')
fig.show()

# Seaborn count plot: how many rows share each SkinThickness value.
plt.figure(figsize=(10, 4))
sns.countplot(data=data, x='SkinThickness', color='black')
plt.title('SkinThickness and count')
plt.xticks(rotation=90)
plt.show()

# Line plot of BMI over Glucose; set_title returns the title Text, which becomes the cell output.
sns.lineplot(data=data, x='Glucose', y='BMI').set_title('Variation of Glucose with BMI')
Text(0.5, 1.0, 'Variation of Glucose with BMI')
# Bar plot of DiabetesPedigreeFunction for each Outcome class.
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# rejects them with a TypeError from 0.12 onwards, where only `data` may be positional.
sns.barplot(x='Outcome', y='DiabetesPedigreeFunction', data=data, color='r')
plt.xticks(rotation=90)
plt.show()
# Scatter plot of Age against BloodPressure on an 8x4 inch figure.
plt.figure(figsize=(8, 4))
sns.scatterplot(data=data, x='BloodPressure', y='Age')
# NOTE(review): the title text reads 'there' where 'their' is presumably meant.
plt.title('BloodPressure and there Age')
# Explicit axis labels; they repeat the column names used for x and y above.
plt.xlabel('BloodPressure')
plt.ylabel('Age')
plt.show()
# Distribution plot (a histogram by default) of the DiabetesPedigreeFunction column.
sns.displot(data["DiabetesPedigreeFunction"])
<seaborn.axisgrid.FacetGrid at 0x14b63e7d640>
# Figure-level relational plot (scatter by default) of Age against SkinThickness.
sns.relplot(x='SkinThickness',y='Age',data=data)
<seaborn.axisgrid.FacetGrid at 0x14b63e76d00>
# Box plot comparing the Age distribution between the two Outcome classes.
sns.boxplot(x='Outcome',y='Age',data=data)
<AxesSubplot:xlabel='Outcome', ylabel='Age'>
# Violin plot comparing the BloodPressure distribution between the two Outcome classes.
sns.violinplot(x='Outcome',y='BloodPressure',data=data)
<AxesSubplot:xlabel='Outcome', ylabel='BloodPressure'>
# Number of rows for each distinct Pregnancies value.
sns.countplot(data=data, x="Pregnancies", color="yellowgreen")
<AxesSubplot:xlabel='Pregnancies', ylabel='count'>
# Stacked histogram of Pregnancies split by Outcome, with a KDE curve overlaid.
# NOTE(review): Pregnancies holds whole numbers (int64, max 17 in the describe() output),
# so 50 bins leaves most bins empty; `discrete=True` may be what is wanted - confirm.
sns.histplot(data, x="Pregnancies", hue="Outcome", multiple="stack",bins = 50, kde=True)
<AxesSubplot:xlabel='Pregnancies', ylabel='Count'>
# MODEL BUILDING

# Separate the label from the predictors: y is the Outcome column,
# X is every remaining column of the table.
y = data['Outcome']
X = data.drop(columns=['Outcome'])

# Show the first rows of the feature matrix as the cell output.
X.head()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | |
|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 |
# First entries of the target Series, to confirm it holds the 0/1 Outcome labels.
y.head()
0 1 1 0 2 1 3 0 4 1 Name: Outcome, dtype: int64
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified on y; the describe() output reports a mean Outcome
# of about 0.35, so `stratify=y` may be worth considering - confirm whether that is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Shapes of the two feature partitions, as a sanity check on the split sizes.
X_train.shape, X_test.shape
((514, 8), (254, 8))
# Confirm every training feature is numeric before it is handed to the classifier.
X_train.dtypes
Pregnancies int64 Glucose int64 BloodPressure int64 SkinThickness int64 Insulin int64 BMI float64 DiabetesPedigreeFunction float64 Age int64 dtype: object
from sklearn.tree import DecisionTreeClassifier
# Decision tree limited to depth 3, splitting on Gini impurity; random_state fixes
# any tie-breaking so repeated runs build the same tree.
DTree = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)
# Fit on the training partition only; fit() returns the estimator, which is echoed as the cell output.
DTree.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=3, random_state=0)
# Predict an Outcome label for every held-out row.
y_pred = DTree.predict(X_test)
# Bare expression: displays the predicted label array as the cell output.
y_pred
array([1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0], dtype=int64)
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label equals the true label.
# `.4f` rounds to four decimals; a spec of `04f` would only set a zero-padded
# minimum width of 4 and keep the default six decimals.
print('model accuracy score with criterion gini index: {0:.4f}'.format(accuracy_score(y_test, y_pred)))
model accuracy score with criterion gini index: 0.692913
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the predicted ones.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Bare expression: displays the matrix as the cell output.
conf_matrix
array([[117, 51],
[ 27, 59]], dtype=int64)
# Draw the confusion matrix as a heatmap on an 8x6 inch figure.
plt.figure(figsize=(8,6))
# annot=True with fmt='d' writes the integer count in each cell; cbar=False hides the colour bar.
# Rows follow the true labels (y_test) and columns the predictions (y_pred), matching the
# argument order given to confusion_matrix.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='crest', cbar=False)
<AxesSubplot:>
from sklearn.metrics import classification_report
# Per-class precision, recall, F1 and support for the test predictions, as formatted text.
class_report = classification_report(y_test, y_pred)
print(class_report)
precision recall f1-score support
0 0.81 0.70 0.75 168
1 0.54 0.69 0.60 86
accuracy 0.69 254
macro avg 0.67 0.69 0.68 254
weighted avg 0.72 0.69 0.70 254
from sklearn import tree
# Draw the tree that was fitted earlier in the notebook. The estimator is passed as-is,
# so plotting no longer retrains the model as a side effect; if DTree has not been fitted
# yet, plot_tree raises instead of silently training it here.
plt.figure(figsize=(12,8))
# feature_names labels each split with its column name instead of a positional X[i] index.
# A plain list is passed because not every scikit-learn version accepts a pandas Index here.
tree.plot_tree(DTree, feature_names=list(X_train.columns))
[Text(0.5, 0.875, 'X[1] <= 154.5\ngini = 0.457\nsamples = 514\nvalue = [332, 182]'), Text(0.25, 0.625, 'X[7] <= 30.5\ngini = 0.382\nsamples = 432\nvalue = [321, 111]'), Text(0.125, 0.375, 'X[1] <= 127.5\ngini = 0.24\nsamples = 251\nvalue = [216, 35]'), Text(0.0625, 0.125, 'gini = 0.153\nsamples = 203\nvalue = [186, 17]'), Text(0.1875, 0.125, 'gini = 0.469\nsamples = 48\nvalue = [30, 18]'), Text(0.375, 0.375, 'X[5] <= 26.95\ngini = 0.487\nsamples = 181\nvalue = [105, 76]'), Text(0.3125, 0.125, 'gini = 0.105\nsamples = 36\nvalue = [34, 2]'), Text(0.4375, 0.125, 'gini = 0.5\nsamples = 145\nvalue = [71, 74]'), Text(0.75, 0.625, 'X[5] <= 28.7\ngini = 0.232\nsamples = 82\nvalue = [11, 71]'), Text(0.625, 0.375, 'X[5] <= 25.35\ngini = 0.486\nsamples = 12\nvalue = [5, 7]'), Text(0.5625, 0.125, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]'), Text(0.6875, 0.125, 'gini = 0.469\nsamples = 8\nvalue = [5, 3]'), Text(0.875, 0.375, 'X[6] <= 1.428\ngini = 0.157\nsamples = 70\nvalue = [6, 64]'), Text(0.8125, 0.125, 'gini = 0.116\nsamples = 65\nvalue = [4, 61]'), Text(0.9375, 0.125, 'gini = 0.48\nsamples = 5\nvalue = [2, 3]')]